1a Species Distribution Modeling- Exploratory Data Analysis

Install packages

# load packages, installing if missing
if (!require(librarian)){
  install.packages("librarian")
  library(librarian)
}
librarian::shelf(
  dismo, dplyr, DT, ggplot2, here, htmltools, leaflet, mapview, purrr, raster, readr, rgbif, rgdal, rJava, sdmpredictors, sf, spocc, tidyr, geojsonio)
select <- dplyr::select # overwrite raster::select

# set random seed for reproducibility
set.seed(42)

# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = F)

Get species observations

American Black Bear

Ursus americanus (American Black Bear)

obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")


# get species occurrence data from GBIF with coordinates
(res <- spocc::occ(
  query = 'Ursus americanus', 
  from = 'gbif', has_coords = T,
  limit = 10000))
## Searched: gbif
## Occurrences - Found: 19,547, Returned: 10,000
## Search type: Scientific
##   gbif: Ursus americanus (10000)
(res_large_limit <- spocc::occ(
  query = 'Ursus americanus', 
  from = 'gbif', has_coords = T,
  limit = 100000))
## Searched: gbif
## Occurrences - Found: 19,547, Returned: 19,547
## Search type: Scientific
##   gbif: Ursus americanus (19547)
# extract data frame from result
df <- res$gbif$data[[1]] 
readr::write_csv(df, obs_csv)
anyDuplicated(df)
## [1] 0
# convert to points of observation from lon/lat columns in data frame
obs <- df %>% 
  filter(basisOfRecord == "HUMAN_OBSERVATION") %>% 
  sf::st_as_sf(
    coords = c("longitude", "latitude"),
    crs = st_crs(4326)) %>% 
  select(prov, key) # save space (joinable from obs_csv)
sf::write_sf(obs, obs_geo, delete_dsn=T)

obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 9993
# show points on map
mapview::mapview(obs, map.types = "Esri.WorldImagery")

Question: How many observations total are in GBIF for your species?

There are 19547 total observations in GBIF for my species.

Question: Did you have to perform any data cleaning steps?

I do not see any odd observations in the data. The cleaning I had to do was minimal, and consisted only of filtering the basis of record to “HUMAN_OBSERVATION”

Get environmental data

dir_env <- file.path(dir_data, "env")

# set a default data directory
options(sdmpredictors_datadir = dir_env)

# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# show table of datasets
#env_datasets %>% 
 # select(dataset_code, description, citation) %>% 
  #DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")

# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
#DT::datatable(env_layers)

Question: What environmental layers did you choose as predictors? Can you find any support for these in the literature?

The layers I chose as predictors are WC_alt (altitude), WC_bio1 (annual mean temperature), WC_bio11 (mean temperature of coldest quarter), WC_bio10 (mean temperature of warmest quarter), WC_bio4 (temperature seasonality), ER_tri (terrain roughness index), ER_topoWet (topographic wetness), WC_bio12 (annual precipitation), WC_bio7 (annual temperature range), latitude, and longitude. Since my prior knowledge of black bear habitat is limited and comes mostly from common knowledge, I decided to choose a wide range of environmental factors as possible predictors.

# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio11", "WC_bio10", "WC_bio4", "ER_tri", "ER_topoWet", "WC_bio12", "WC_bio7")

# get layers
env_stack <- load_layers(env_layers_vec)

# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)

obs_hull_geo  <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")


# make convex hull around points of observation
obs_hull <- sf::st_convex_hull(st_union(obs))
  
# save obs hull
write_sf(obs_hull, obs_hull_geo)

obs_hull <- read_sf(obs_hull_geo)

# show points on map
mapview(
  list(obs, obs_hull))
obs_hull_sp <- sf::as_Spatial(obs_hull)
env_stack <- raster::mask(env_stack, obs_hull_sp) %>% 
  raster::crop(extent(obs_hull_sp))
writeRaster(env_stack, env_stack_grd, overwrite=T)  

env_stack <- stack(env_stack_grd)

# show map
# mapview(obs) + 
#   mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)

## Pseudo-Absence

absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")


# get raster count of observations
r_obs <- rasterize(
  sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
  
# show map
#mapview(obs) + 
 mapview(r_obs)
# create mask for 
r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)

# generate random points inside mask
absence <- dismo::randomPoints(r_mask, nrow(obs)) %>% 
  as_tibble() %>% 
  st_as_sf(coords = c("x", "y"), crs = 4326)

write_sf(absence, absence_geo, delete_dsn=T)

absence <- read_sf(absence_geo)

# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") + 
  mapview(absence, col.regions = "gray")
# combine presence and absence into single set of labeled points 
pts <- rbind(
  obs %>% 
    mutate(
      present = 1) %>% 
    select(present, key),
  absence %>% 
    mutate(
      present = 0,
      key     = NA)) %>% 
mutate(
  ID = 1:n()) %>% 
relocate(ID)
write_sf(pts, pts_geo, delete_dsn=T)
# extract raster values for points
pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>% 
  tibble() %>% 
  # join present and geometry columns to raster value results for points
  left_join(
    pts %>% 
      select(ID, present),
    by = "ID") %>% 
  relocate(present, .after = ID) %>% 
  # extract lon, lat as single columns
  mutate(
    #present = factor(present),
    lon = st_coordinates(geometry)[,1],
    lat = st_coordinates(geometry)[,2]) %>% 
  select(-geometry)
write_csv(pts_env, pts_env_csv)

pts_env <- read_csv(pts_env_csv)

pts_env %>% 
  # show first 10 presence, last 10 absence
  slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>% 
  DT::datatable(
    rownames = F,
    options = list(
      dom = "t",
      pageLength = 20))

Term plots

pts_env %>% 
  select(-ID) %>% 
  mutate(
    present = factor(present)) %>% 
  pivot_longer(-present) %>% 
  ggplot() +
  geom_density(aes(x = value, fill = present)) + 
  scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  theme_bw() + 
  facet_wrap(~name, scales = "free") +
  theme(
    legend.position = c(1, 0),
    legend.justification = c(1, 0))

1b. Logistic Regression

librarian::shelf(
  DT, dplyr, dismo, GGally, here, readr, tidyr)
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = F)

dir_data    <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 19986
datatable(pts_env, rownames = F)
GGally::ggpairs(
  select(pts_env, -ID),
  aes(color = factor(present)))

## Linear Model

# setup model data
d <- pts_env %>% 
  select(-ID) %>%  # remove terms we don't want to model
  tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 19923
# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
## 
## Call:
## lm(formula = present ~ ., data = d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.19199 -0.34812  0.01004  0.32332  1.25480 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.723e+00  1.093e-01  34.075  < 2e-16 ***
## WC_alt      -1.960e-04  1.257e-05 -15.593  < 2e-16 ***
## WC_bio1      4.071e-02  1.004e-02   4.055 5.04e-05 ***
## WC_bio11    -6.478e-02  1.701e-02  -3.809  0.00014 ***
## WC_bio10    -1.406e-02  1.978e-02  -0.711  0.47717    
## WC_bio4     -1.324e-02  4.407e-03  -3.005  0.00266 ** 
## ER_tri      -6.332e-04  1.363e-04  -4.645 3.43e-06 ***
## ER_topoWet  -1.259e-01  3.526e-03 -35.705  < 2e-16 ***
## WC_bio12     1.345e-04  1.054e-05  12.764  < 2e-16 ***
## WC_bio7      7.510e-03  2.291e-03   3.278  0.00105 ** 
## lon         -1.075e-03  3.361e-04  -3.198  0.00139 ** 
## lat         -3.090e-02  1.893e-03 -16.325  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4186 on 19911 degrees of freedom
## Multiple R-squared:  0.2995, Adjusted R-squared:  0.2991 
## F-statistic: 773.8 on 11 and 19911 DF,  p-value: < 2.2e-16
y_predict <- predict(mdl, d, type="response")
y_true    <- d$present

range(y_predict)
## [1] -0.4135219  1.2680329
range(y_true)
## [1] 0 1

Generalized Linear Model

mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8020  -0.8368  -0.1280   0.8262   2.9836  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.745e+01  7.316e-01  23.848  < 2e-16 ***
## WC_alt      -1.026e-03  8.141e-05 -12.600  < 2e-16 ***
## WC_bio1      3.916e-01  6.435e-02   6.085 1.17e-09 ***
## WC_bio11    -5.714e-01  1.016e-01  -5.626 1.84e-08 ***
## WC_bio10    -3.278e-02  1.165e-01  -0.281  0.77842    
## WC_bio4     -1.085e-01  2.611e-02  -4.157 3.22e-05 ***
## ER_tri      -4.147e-03  8.332e-04  -4.977 6.45e-07 ***
## ER_topoWet  -6.803e-01  2.251e-02 -30.226  < 2e-16 ***
## WC_bio12     8.404e-04  7.111e-05  11.818  < 2e-16 ***
## WC_bio7      4.029e-02  1.361e-02   2.960  0.00307 ** 
## lon         -8.724e-03  2.031e-03  -4.296 1.74e-05 ***
## lat         -1.706e-01  1.207e-02 -14.133  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27619  on 19922  degrees of freedom
## Residual deviance: 20676  on 19911  degrees of freedom
## AIC: 20700
## 
## Number of Fisher Scoring iterations: 4
y_predict <- predict(mdl, d, type="response")

range(y_predict)
## [1] 0.005363644 0.986350892
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F, ylim="free")

Generalized additive model

librarian::shelf(mgcv)

# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) + 
    s(WC_bio11) + s(WC_bio10) + s(WC_bio4) + s(ER_tri) + s(ER_topoWet) + s(WC_bio12) + s(WC_bio7) + s(lon) + s(lat), 
  family = binomial, data = d)
summary(mdl)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio11) + s(WC_bio10) + 
##     s(WC_bio4) + s(ER_tri) + s(ER_topoWet) + s(WC_bio12) + s(WC_bio7) + 
##     s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.38553    0.03145  -12.26   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df Chi.sq p-value    
## s(WC_alt)     8.203  8.815 218.04  <2e-16 ***
## s(WC_bio1)    8.182  8.799 370.16  <2e-16 ***
## s(WC_bio11)   8.971  8.995 228.46  <2e-16 ***
## s(WC_bio10)   7.161  7.834 258.38  <2e-16 ***
## s(WC_bio4)    8.986  8.999 271.33  <2e-16 ***
## s(ER_tri)     8.169  8.809  80.35  <2e-16 ***
## s(ER_topoWet) 6.665  7.812 227.22  <2e-16 ***
## s(WC_bio12)   8.835  8.991 390.23  <2e-16 ***
## s(WC_bio7)    8.585  8.941 134.50  <2e-16 ***
## s(lon)        8.714  8.976 254.46  <2e-16 ***
## s(lat)        8.967  8.999 575.16  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.501   Deviance explained = 43.1%
## UBRE = -0.20174  Scale est. = 1         n = 19923
# show term plots
plot(mdl, scale=0)

Question: Which GAM environmental variables, and even range of values, seem to contribute most towards presence (above 0 response) versus absence (below 0 response)?

The variables with the greatest range of values were WC_bio1 and WC_bio10 contributing greately to both presence and absence. WC_bio1 had values from -60 to 30 and WC_bio10 had values between -40 and 50.

Maxent

# load extra packages
librarian::shelf(
  maptools, sf)

mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")

# show version of maxent
if (!interactive())
  maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)

# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>% 
  sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class

# fit a maximum entropy model

mdl <- maxent(env_stack, obs_sp)
## This is MaxEnt version 3.4.3
readr::write_rds(mdl, mdl_maxent_rds)

mdl <- read_rds(mdl_maxent_rds)

# plot variable contributions per predictor
plot(mdl)

response(mdl)

Question: Which Maxent environmental variables, and even range of values, seem to contribute most towards presence (closer to 1 response) and how might this differ from the GAM results?

The variables that seem to contribute most towards presence in the maxent model are WC_alt (altitude), WC_bio1 (annual mean temperature), WC_bio11 (mean temperature of coldest quarter), and WC_bio7 (annual temperature range) compared to the GAM model which determined WC_bio1 (annual mean temperature) and WC_bio10 (mean temperature of warmest quarter) to contribute the most.

# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')

plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

1c. Decision Trees

# load packages
librarian::shelf(
  caret,       # m: modeling framework
  dplyr, ggplot2 ,here, readr, 
  pdp,         # X: partial dependence plots
  ranger,      # m: random forest modeling
  rpart,       # m: recursive partition modeling
  rpart.plot,  # m: recursive partition plotting
  rsample,     # d: split train/test data
  skimr,       # d: skim summarize data table
  vip)         # X: variable importance

# options
options(
  scipen = 999,
  readr.show_col_types = F)
set.seed(42)

# graphical theme
ggplot2::theme_set(ggplot2::theme_light())

# paths
dir_data    <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

# read data
pts_env <- read_csv(pts_env_csv)
d <- pts_env %>% 
  select(-ID) %>%                   # not used as a predictor x
  mutate(
    present = factor(present)) %>%  # categorical response
  na.omit()                         # drop rows with NA
skim(d)
Data summary
Name d
Number of rows 19923
Number of columns 12
_______________________
Column type frequency:
factor 1
numeric 11
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
present 0 1 FALSE 2 0: 9969, 1: 9954

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
WC_alt 0 1 753.06 703.56 -68.00 236.00 472.00 1128.00 3639.00 ▇▂▂▁▁
WC_bio1 0 1 6.36 7.21 -12.90 1.40 6.10 11.30 24.30 ▁▅▇▅▂
WC_bio11 0 1 -5.33 10.36 -31.40 -11.80 -4.80 2.50 19.40 ▂▃▇▆▂
WC_bio10 0 1 17.52 5.20 -0.40 13.70 17.00 21.00 35.70 ▁▅▇▃▁
WC_bio4 0 1 89.22 28.59 19.90 70.30 85.41 108.38 166.89 ▂▇▇▅▂
ER_tri 0 1 43.04 48.03 0.00 6.96 22.90 67.33 278.86 ▇▂▁▁▁
ER_topoWet 0 1 10.72 1.92 6.58 9.08 10.80 12.32 15.11 ▃▇▇▇▂
WC_bio12 0 1 829.18 481.21 52.00 459.00 731.00 1112.00 3395.00 ▇▇▁▁▁
WC_bio7 0 1 37.91 8.19 13.10 32.90 37.80 43.50 56.50 ▁▂▇▆▃
lon 0 1 -101.22 20.72 -161.54 -118.71 -102.38 -82.79 -53.96 ▁▆▇▇▃
lat 0 1 44.67 9.34 23.29 37.51 44.44 50.59 67.66 ▂▆▇▅▂

Split data into training and testing

# create training set with 80% of full data
d_split  <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train  <- rsample::training(d_split)

# show number of rows present is 0 vs 1
table(d$present)
## 
##    0    1 
## 9969 9954
table(d_train$present)
## 
##    0    1 
## 7975 7963

Decision Trees

Partition, depth = 1

# run decision stump model
mdl <- rpart(
  present ~ ., data = d_train, 
  control = list(
    cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 15938 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 15938 7963 0 (0.5003765 0.4996235)  
##   2) ER_topoWet>=10.975 7520 2095 0 (0.7214096 0.2785904) *
##   3) ER_topoWet< 10.975 8418 2550 1 (0.3029223 0.6970777) *
# plot tree 
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)

Partition, depth = default

# decision tree with defaults
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 15938 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 15938 7963 0 (0.50037646 0.49962354)  
##    2) ER_topoWet>=10.975 7520 2095 0 (0.72140957 0.27859043)  
##      4) WC_bio12< 703.5 3658  441 0 (0.87944232 0.12055768) *
##      5) WC_bio12>=703.5 3862 1654 0 (0.57172450 0.42827550)  
##       10) WC_bio1< 0.95 432   32 0 (0.92592593 0.07407407) *
##       11) WC_bio1>=0.95 3430 1622 0 (0.52711370 0.47288630)  
##         22) lon< -81.8741 1988  655 0 (0.67052314 0.32947686)  
##           44) lat< 44.95915 1453  314 0 (0.78389539 0.21610461) *
##           45) lat>=44.95915 535  194 1 (0.36261682 0.63738318) *
##         23) lon>=-81.8741 1442  475 1 (0.32940361 0.67059639) *
##    3) ER_topoWet< 10.975 8418 2550 1 (0.30292231 0.69707769)  
##      6) WC_bio11< -12.85 996  172 0 (0.82730924 0.17269076) *
##      7) WC_bio11>=-12.85 7422 1726 1 (0.23255187 0.76744813)  
##       14) WC_bio12< 416.5 839  311 0 (0.62932062 0.37067938)  
##         28) lat< 46.66194 677  181 0 (0.73264402 0.26735598) *
##         29) lat>=46.66194 162   32 1 (0.19753086 0.80246914) *
##       15) WC_bio12>=416.5 6583 1198 1 (0.18198390 0.81801610) *
rpart.plot(mdl)

# plot complexity parameter
plotcp(mdl)

Feature interpretation

# caret cross validation results
mdl_caret <- train(
  present ~ .,
  data       = d_train,
  method     = "rpart",
  trControl  = trainControl(method = "cv", number = 10),
  tuneLength = 20)

ggplot(mdl_caret)

vip(mdl_caret, num_features = 40, bar = FALSE)

Question: what are the top 3 most important variables of your model?

The 3 most important variables of the model are WC_bio11 (mean temperature of coldest quarter), WC_bio4 (temperature seasonality), and latitude.

# Construct partial dependence plots
p1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
p2 <- partial(mdl_caret, pred.var = "WC_bio11") %>% autoplot()
p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio11")) %>% 
  plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE, 
              colorkey = TRUE, screen = list(z = -20, x = -60))

# Display plots side by side
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)

Random Forests

Fit

# number of features
n_features <- length(setdiff(names(d_train), "present"))

# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)

# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.3148558

Feature interpretation

# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
  present ~ ., data = d_train,
  importance = "impurity")

# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
  present ~ ., data = d_train,
  importance = "permutation")
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)

gridExtra::grid.arrange(p1, p2, nrow = 1)

Question: How might variable importance differ between rpart and RandomForest in your model outputs?

Variable importance could be calculated differently with different methods leading to different predictions.

1d. Evaluate Models

# load packages
librarian::shelf(
  dismo, # species distribution modeling: maxent(), predict(), evaluate(), 
  dplyr, ggplot2, GGally, here, maptools, readr, 
  raster, readr, rsample, sf,
  usdm)  # uncertainty analysis for species distribution models: vifcor()
select = dplyr::select

# options
set.seed(42)
options(
  scipen = 999,
  readr.show_col_types = F)
ggplot2::theme_set(ggplot2::theme_light())

# paths
dir_data      <- here("data/sdm")
pts_geo       <- file.path(dir_data, "pts.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
mdl_maxv_rds  <- file.path(dir_data, "mdl_maxent_vif.rds")

# read points of observation: presence (1) and absence (0)
pts <- read_sf(pts_geo)

# read raster stack of environment
env_stack <- raster::stack(env_stack_grd)

Split observations into training and testing

# create training set with 80% of full data
pts_split  <- rsample::initial_split(
  pts, prop = 0.8, strata = "present")
pts_train  <- rsample::training(pts_split)
pts_test   <- rsample::testing(pts_split)

pts_train_p <- pts_train %>% 
  filter(present == 1) %>% 
  as_Spatial()
pts_train_a <- pts_train %>% 
  filter(present == 0) %>% 
  as_Spatial()

Calibrate: Model Selection

# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)

# calculate variance inflation factor per predictor, a metric of multicollinearity between variables
vif(env_stack)
##    Variables         VIF
## 1     WC_alt    4.656657
## 2    WC_bio1  538.646564
## 3   WC_bio11 3391.138926
## 4   WC_bio10 1138.941779
## 5    WC_bio4 1448.864541
## 6     ER_tri    4.255321
## 7 ER_topoWet    4.520922
## 8   WC_bio12    2.936697
## 9    WC_bio7   41.529852
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th=0.7) 
v
## 4 variables from the 9 input variables have collinearity problem: 
##  
## WC_bio11 WC_bio4 WC_bio1 ER_topoWet 
## 
## After excluding the collinear variables, the linear correlation coefficients ranges between: 
## min correlation ( WC_bio10 ~ WC_alt ):  -0.1181368 
## max correlation ( WC_bio7 ~ WC_bio12 ):  -0.6266193 
## 
## ---------- VIFs of the remained variables -------- 
##   Variables      VIF
## 1    WC_alt 1.983334
## 2  WC_bio10 1.802469
## 3    ER_tri 2.249538
## 4  WC_bio12 2.506921
## 5   WC_bio7 3.114756
# reduce enviromental raster stack by 
env_stack_v <- usdm::exclude(env_stack, v)

# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)

# fit a maximum entropy model
if (!file.exists(mdl_maxv_rds)){
  mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
  readr::write_rds(mdl_maxv, mdl_maxv_rds)
}
mdl_maxv <- read_rds(mdl_maxv_rds)

# plot variable contributions per predictor
plot(mdl_maxv)

Question: Which variables were removed due to multicollinearity and what is the rank of most to least important remaining variables in your model?

The variables removed include WC_bio11 (mean temperature of coldest quarter), WC_bio4 (temperature seasonality), WC_bio1 (annual mean temperature), and ER_topoWet (topographic wetness). The remaining variables in the order of most to least important are ER_tri (terrain roughness index), WC_bio12 (annual precipitation), WC_bio10 (mean temperature of warmest quarter), WC_bio7 (annual temperature range), and WC_alt (altitude).

# plot term plots
response(mdl_maxv)

# predict
y_maxv <- predict(env_stack, mdl_maxv) #, ext=ext, progress='')

plot(y_maxv, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

Evaluate: Model Performance

pts_test_p <- pts_test %>% 
  filter(present == 1) %>% 
  as_Spatial()
pts_test_a <- pts_test %>% 
  filter(present == 0) %>% 
  as_Spatial()

y_maxv <- predict(mdl_maxv, env_stack)
#plot(y_maxv)

e <- dismo::evaluate(
  p     = pts_test_p,
  a     = pts_test_a, 
  model = mdl_maxv,
  x     = env_stack)
e
## class          : ModelEvaluation 
## n presences    : 1993 
## n absences     : 1992 
## AUC            : 0.866353 
## cor            : 0.6218343 
## max TPR+TNR at : 0.6316922
plot(e, 'ROC')

thr <- threshold(e)[['spec_sens']]
thr
## [1] 0.6316922
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)

# (t)rue/(f)alse (p)ositive/(n)egative rates
tpr <- sum(p_true)/length(p_true)
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)

matrix(
  c(tpr, fnr,
    fpr, tnr), 
  nrow=2, dimnames = list(
    c("present_obs", "absent_obs"),
    c("present_pred", "absent_pred")))
##             present_pred absent_pred
## present_obs    0.8549925   0.2705823
## absent_obs     0.1450075   0.7294177
# add point to ROC plot
plot(e, 'ROC')
points(fpr, tpr, pch=23, bg="blue")

plot(y_maxv > thr)